{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 下载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import re\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "from typing import List, Dict\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import requests\n",
    "from joblib import Parallel, delayed\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_images(url: str, image_save_path: Path, timeout: float = 30.0) -> None:\n",
    "    \"\"\"Download one image to disk, skipping destinations that already exist.\n",
    "\n",
    "    The download is best-effort: a non-200 response, or any error raised while\n",
    "    fetching or saving, is logged and the function returns normally, so a\n",
    "    single dead link cannot abort a bulk run.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the image to fetch.\n",
    "        image_save_path: Destination file; nothing is fetched when it exists.\n",
    "        timeout: Seconds passed to ``requests.get`` so a stalled server\n",
    "            cannot block a worker forever.\n",
    "    \"\"\"\n",
    "    if image_save_path.exists():\n",
    "        return\n",
    "\n",
    "    logger = logging.getLogger(__name__)\n",
    "    # Write to a sibling temp file and rename at the end: an interrupted\n",
    "    # download must not leave a truncated image that the exists() check\n",
    "    # above would treat as finished on the next run.\n",
    "    partial_path = image_save_path.with_name(image_save_path.name + \".part\")\n",
    "    try:\n",
    "        response = requests.get(url, timeout=timeout)\n",
    "        if response.status_code != 200:\n",
    "            logger.warning(\"Skipping %s: HTTP %s\", url, response.status_code)\n",
    "            return\n",
    "        partial_path.write_bytes(response.content)\n",
    "        partial_path.replace(image_save_path)\n",
    "    except Exception as error:\n",
    "        # Deliberately broad: one bad URL must never stop the whole batch.\n",
    "        logger.warning(\"Failed to download %s: %s\", url, error)\n",
    "        partial_path.unlink(missing_ok=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Folder that receives every downloaded image; created here if missing.\n",
    "image_save_dir = \"bigdata/image_data\"\n",
    "Path(image_save_dir).mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the train and test manifests; the `url` and `filename` columns of both are used below.\n",
    "train_data = pd.read_csv(filepath_or_buffer=\"bigdata/raw_data/train-137w.csv\")\n",
    "test_data = pd.read_csv(filepath_or_buffer=\"bigdata/raw_data/test-2.6w.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>url</th>\n",
       "      <th>filename</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://gimg2.baidu.com/image_search/src=http%...</td>\n",
       "      <td>train-0.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>https://gss0.baidu.com/70cFfyinKgQFm2e88IuM_a/...</td>\n",
       "      <td>train-1.jpg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>https://gimg2.baidu.com/image_search/src=http%...</td>\n",
       "      <td>train-4.jpg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 url     filename\n",
       "0  https://gimg2.baidu.com/image_search/src=http%...  train-0.jpg\n",
       "1  https://gss0.baidu.com/70cFfyinKgQFm2e88IuM_a/...  train-1.jpg\n",
       "2  https://gimg2.baidu.com/image_search/src=http%...  train-4.jpg"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the url/filename columns of both manifests into one download list with a fresh 0..n-1 index.\n",
    "all_image_data = pd.concat(\n",
    "    [frame[['url', 'filename']] for frame in (train_data, test_data)],\n",
    "    ignore_index=True,\n",
    ")\n",
    "all_image_data.head(3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on an independent copy of the full list; for a quick smoke test,\n",
    "# chain .sample(20).reset_index(drop=True) onto the copy instead.\n",
    "smalldata = all_image_data.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "350"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows handed to the worker pool per progress-bar step.\n",
    "chunk_size = 4000\n",
    "# First row position of each chunk; the final chunk may be shorter.\n",
    "start_id = range(0, len(smalldata), chunk_size)\n",
    "len(start_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# smalldata[4:(4 + chunk_size)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "main---->: 100%|██████████| 350/350 [13:40<00:00,  2.34s/it]\n"
     ]
    }
   ],
   "source": [
    "# Fetch the images one chunk at a time so the progress bar ticks once per chunk.\n",
    "image_dir = Path(image_save_dir)  # built once rather than once per row\n",
    "for i in tqdm(start_id, desc='main---->'):\n",
    "    chunk_rows = smalldata.iloc[i:(i + chunk_size)]\n",
    "    # itertuples yields lightweight namedtuples instead of one Series per row.\n",
    "    Parallel(n_jobs=10)(\n",
    "        delayed(download_images)(row.url, image_dir.joinpath(row.filename))\n",
    "        for row in chunk_rows.itertuples(index=False)\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "110bc624a448454d574a0cd6cc76359fd86f75739e493913b3d71c2e04f2ffdb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
